In [1]:
# imports
from os.path import join
import pandas as pd
from sklearn.model_selection import train_test_split
from discover_ import Discover
from discover.utils.Timer import Timer

# --- configuration -----------------------------------------------------------
# dummy_run is forwarded to Discover and also selects the small fixed subset
# (instead of the random train/val split) at the bottom of this cell.
dummy_run = False
disc = Discover(dummy_run=dummy_run)

# Seed for the train/val split so that Restart Kernel -> Run All reproduces the
# same partition.
split_seed = 42

# --- load data ---------------------------------------------------------------
# HACK: path is relative to the current working directory (not absolute), so the
# notebook must be run from the directory that contains CrabNet/ while the
# dependency structure is still being worked out
data_dir = join("CrabNet", "data", "materials_data", "elasticity")
name = "train.csv"  # "example_materials_property_val_output.csv", #elasticity_val_output.csv"
fpath = join(data_dir, name)
df = pd.read_csv(fpath)

# --- collapse duplicate formulas ---------------------------------------------
# if there are two compounds with the same formula, we're more interested in the higher GPa
# group_filter is the aggregation applied to "target" within each formula group;
# it is passed straight to .agg(), so changing it here (e.g. to "mean") takes effect.
group_filter = "max"  # "mean"
grp_df = (
    df.reset_index()
    .groupby(by="formula")
    # "index" keeps the original row positions of every entry merged into the group
    .agg({"index": lambda x: tuple(x), "target": group_filter})
    .reset_index()
)

# REVIEW: drop pure elements here?

# --- train / validation split ------------------------------------------------
# .copy() hands independent frames to Discover rather than slices of grp_df.
# NOTE(review): this is expected to address the SettingWithCopyWarning emitted
# during fit/predict if it originates from these slices -- confirm on re-run.
if dummy_run:
    # take small subset
    n = 100
    n2 = 10
    train_df = grp_df.iloc[:n, :].copy()
    val_df = grp_df.iloc[n : n + n2, :].copy()
else:
    # REVIEW: consider changing train_size to 0.2
    train_df, val_df = train_test_split(
        grp_df, train_size=0.8, random_state=split_seed
    )
    train_df = train_df.copy()
    val_df = val_df.copy()
In [2]:
# Time the call to disc.fit on the training split.
# NOTE: slower if umap_random_state is not None
fit_label = "DISCOVER-fit"
with Timer(fit_label):
    disc.fit(train_df)
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1667: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
Model architecture: out_dims, d_model, N, heads
3, 512, 3, 4
Running on compute device: cuda:0
Model size: 11987206 parameters

Generating EDM: 100%|██████████| 8572/8572 [00:00<00:00, 156165.76formulae/s]
loading data with up to 6 elements in the formula
training with batchsize 512 (2**9.000)
stepping every 170 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/40 --- train mae: 53 val mae: 53
Epoch: 19/40 --- train mae: 12.7 val mae: 12.7
Epoch: 39/40 --- train mae: 9.64 val mae: 9.64
Saving network (test-property) to models/trained_models/test-property.pth
[train-CrabNet]
Elapsed: 89.82874

[DISCOVER-fit]
Elapsed: 89.82974

In [3]:
# Time the call to disc.predict on the validation split; the returned value is
# kept in `score` for later cells.
predict_label = "DISCOVER-predict"
with Timer(predict_label):
    score = disc.predict(val_df)
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1773: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
Generating EDM: 100%|██████████| 8572/8572 [00:00<00:00, 182869.33formulae/s]
loading data with up to 6 elements in the formula
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1667: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
Generating EDM: 100%|██████████| 2143/2143 [00:00<00:00, 165285.55formulae/s]
loading data with up to 6 elements in the formula
Fitting mod_petti kernel matrix
Constructing distances
[fit-wasserstein]
Elapsed: 17.19802

C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\umap\umap_.py:1735: UserWarning: using precomputed metric; transform will be unavailable for new data and inverse_transform will be unavailable for all data
  warn(
[fit-UMAP]
Elapsed: 45.07268

C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\umap\umap_.py:1735: UserWarning: using precomputed metric; transform will be unavailable for new data and inverse_transform will be unavailable for all data
  warn(
[fit-vis-UMAP]
Elapsed: 17.08833

[HDBSCAN*]
Elapsed: 0.14162

[pdf-summation]
Elapsed: 10.41885

[train-val-pdf-summation]
Elapsed: 1.89099

[DISCOVER-predict]
Elapsed: 96.64376

In [4]:
# Time the call to disc.plot.
plot_label = "DISCOVER-plot"
with Timer(plot_label):
    disc.plot()
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\plotly\graph_objs\_deprecations.py:378: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


[DISCOVER-plot]
Elapsed: 6.43417